In [ ]:
What's for those recently in confucian wikipedias, what's

In [3]:
import pandas as pd
import numpy
from collections import defaultdict
import json
import statsmodels.api as sm
from matplotlib.pylab import style
style.use('fivethirtyeight')
%pylab inline
java_min_int = -2147483648
WIKIS =('jawiki', 'tlwiki', 'urwiki', 'zhwiki', 'dewiki', 'enwiki','kowiki')


Populating the interactive namespace from numpy and matplotlib

In [4]:
allrecs = pd.read_csv('snapshot_data/2014-10-13/gender-index-data-2014-10-13.csv',na_values=[java_min_int])
def split_column(q_str):
    if type(q_str) is float:
        if numpy.isnan(q_str):
            return q_str 
    if type(q_str) is str:
        qs = q_str.split('|')
        return qs[:-1] #cos the format will always end with a |

for col in ['gender','site_links']:
    allrecs[col] = allrecs[col].apply(split_column)

In [5]:
def has(xxwiki):
    def has_xx(row):
        if isinstance(row['site_links'], list):
            return xxwiki in row['site_links']
        else: return False
    return has_xx

generate files


In [6]:
allrecs.head(20)


Out[6]:
qid dob dod gender ethnic_group citizenship place_of_birth site_links
0 Q23 1732 1799 [Q6581097] NaN Q30| Q494413| [zhwiki, kywiki, euwiki, plwiki, bswiki, angwi...
1 Q42 1952 2001 [Q6581097] NaN Q145| Q350| [zhwiki, jvwiki, euwiki, plwiki, bswiki, eswik...
2 Q207 1946 NaN [Q6581097] NaN Q30| Q49145| [uzwiki, eswiki, kowikiquote, huwiki, liwikiqu...
3 Q297 NaN 1660 [Q6581097] NaN Q29| Q8717| [zhwiki, kywiki, plwiki, euwiki, bswiki, uzwik...
4 Q326 1942 NaN [Q6581097] NaN Q298|Q39| Q2887| [zhwiki, plwiki, euwiki, kowiki, frwiki, eswik...
5 Q368 1915 2006 [Q6581097] NaN Q298| Q33986| [lbwiki, zhwiki, plwiki, euwiki, bswiki, angwi...
6 Q377 1882 1942 [Q6581097] NaN Q34266|Q2895|Q15180| Q658871| [zhwiki, kywiki, ukwikisource, jvwiki, plwiki,...
7 Q475 1911 1982 [Q6581097] NaN Q298| Q2887| [plwiki, euwiki, kowiki, frwiki, eswiki, yowik...
8 Q501 1821 1867 [Q6581097] NaN Q142| Q90| [zhwiki, glwikisource, plwiki, euwiki, bswiki,...
9 Q530 1956 NaN [Q6581097] NaN Q34| Q499415| [plwiki, euwiki, frwiki, bswiki, bewiki, eswik...
10 Q555 1973 NaN [Q6581072] NaN Q30| Q1020700| [zhwiki, eowiki, plwiki, kowiki, ruwiki, frwik...
11 Q619 1473 1543 [Q6581097] NaN Q36| Q47554| [szlwiki, zhwiki, kywiki, plwiki, euwiki, bswi...
12 Q633 1945 NaN [Q6581097] NaN Q16| Q172| [euwikiquote, zhwiki, plwiki, euwiki, eswiki, ...
13 Q635 -68 -29 [Q6581072] NaN Q11768| Q87| [zhwiki, plwiki, euwiki, bswiki, uzwiki, eswik...
14 Q747 1606 1684 [Q6581097] NaN Q70972| Q30974| [lbwiki, zhwiki, plwiki, euwiki, bswiki, eswik...
15 Q815 1898 1980 [Q6581097] NaN Q298| Q14467| [ptwiki, plwiki, ruwiki, kowiki, frwiki, enwik...
16 Q849 1431 1463 [Q6581097] NaN Q70972| Q90| [zhwiki, plwiki, euwiki, ptwikisource, eswiki,...
17 Q853 1932 1986 [Q6581097] NaN Q15180| Q15651436| [zhwiki, plwiki, euwiki, eswiki, afwiki, ocwik...
18 Q859 -427 -347 [Q6581097] NaN Q844930| Q1779520| [uzwiki, bhwiki, eswiki, ptwikisource, huwiki,...
19 Q873 1949 NaN [Q6581072] NaN Q30| Q1000642| [lbwiki, zhwiki, jvwiki, plwiki, euwiki, bswik...

In [40]:
def makedecades(b,e):
    for y in range(b, e, 10):
        yield y, y+10
        
def isfemale(x):
    if isinstance(x, list):
        return x[0] == 'Q6581072'
    else: return False
def ismale(x):
    if isinstance(x, list):
        return x[0] == 'Q6581097'
    else: return False
def nogender(x):
    if not isinstance(x, list):
        return True
    else: return False
        
for xxwiki in WIKIS:
    has_wiki = has(xxwiki)
    recs = allrecs[allrecs.apply(has_wiki, axis=1)]
    for gender, gender_test in (('female', isfemale), ('male', ismale), ('nogender', nogender)):
        grecs = recs[recs['gender'].apply(gender_test)]

        for start_year, stop_year in  makedecades(1930,1990):
            modrecs = grecs[(grecs['dob'] >= start_year) &(grecs['dob'] < stop_year)]
            #print len(modrecs), xxwiki, start_year
            filepath = 'helpers/inspection/{}_{}_{}.json'.format(xxwiki, start_year,gender)
            json.dump(list(modrecs['qid']), open(filepath,'w'))
            !scp $filepath wmflabs-tools:/home/maximilianklein/inspectionshortcut


 772 jawiki 1930
1202 jawiki 1940
1457 jawiki 1950
2194 jawiki 1960
3138 jawiki 1970
4304 jawiki 1980
4718 jawiki 1930
6520 jawiki 1940
6555 jawiki 1950
7536 jawiki 1960
9715 jawiki 1970
11830 jawiki 1980
340 jawiki 1930
422 jawiki 1940
379 jawiki 1950
376 jawiki 1960
379 jawiki 1970
305 jawiki 1980
177 tlwiki 1930
235 tlwiki 1940
294 tlwiki 1950
427 tlwiki 1960
625 tlwiki 1970
837 tlwiki 1980
359 tlwiki 1930
480 tlwiki 1940
479 tlwiki 1950
557 tlwiki 1960
598 tlwiki 1970
683 tlwiki 1980
0 tlwiki 1930
1 tlwiki 1940
2 tlwiki 1950
2 tlwiki 1960
6 tlwiki 1970
3 tlwiki 1980
109 urwiki 1930
156 urwiki 1940
189 urwiki 1950
229 urwiki 1960
422 urwiki 1970
697 urwiki 1980
176 urwiki 1930
199 urwiki 1940
176 urwiki 1950
142 urwiki 1960
126 urwiki 1970
133 urwiki 1980
0 urwiki 1930
0 urwiki 1940
0 urwiki 1950
1 urwiki 1960
0 urwiki 1970
1 urwiki 1980
307 zhwiki 1930
468 zhwiki 1940
717 zhwiki 1950
996 zhwiki 1960
1637 zhwiki 1970
3014 zhwiki 1980
2448 zhwiki 1930
3339 zhwiki 1940
3495 zhwiki 1950
3794 zhwiki 1960
4753 zhwiki 1970
6477 zhwiki 1980
257 zhwiki 1930
389 zhwiki 1940
624 zhwiki 1950
543 zhwiki 1960
257 zhwiki 1970
299 zhwiki 1980
4101 dewiki 1930
7107 dewiki 1940
8671 dewiki 1950
10194 dewiki 1960
10401 dewiki 1970
10982 dewiki 1980
31003 dewiki 1930
37919 dewiki 1940
35954 dewiki 1950
33389 dewiki 1960
28342 dewiki 1970
29896 dewiki 1980
0 dewiki 1930
0 dewiki 1940
1 dewiki 1950
1 dewiki 1960
0 dewiki 1970
0 dewiki 1980
7338 enwiki 1930
12475 enwiki 1940
15750 enwiki 1950
17603 enwiki 1960
20983 enwiki 1970
24977 enwiki 1980
50168 enwiki 1930
67619 enwiki 1940
71683 enwiki 1950
71914 enwiki 1960
78598 enwiki 1970
96430 enwiki 1980
48 enwiki 1930
63 enwiki 1940
88 enwiki 1950
88 enwiki 1960
92 enwiki 1970
93 enwiki 1980
215 kowiki 1930
397 kowiki 1940
552 kowiki 1950
920 kowiki 1960
1454 kowiki 1970
2056 kowiki 1980
1405 kowiki 1930
2172 kowiki 1940
2240 kowiki 1950
2913 kowiki 1960
3851 kowiki 1970
5287 kowiki 1980
8 kowiki 1930
13 kowiki 1940
20 kowiki 1950
24 kowiki 1960
37 kowiki 1970
34 kowiki 1980

then wait for the remote 200word summary task on labs under viafbot/inspection


In [42]:
!mkdir helpers/inspection/expanded_descriptions

In [78]:
!scp wmflabs-tools:/home/maximilianklein/inspectionshortcut/output/* helpers/inspection/expanded_descriptions/.


If you are having access problems, please see: https://wikitech.wikimedia.org/wiki/Access#Accessing_public_and_private_instances
dewiki_1930_female_descriptions.json          100% 1054KB   1.0MB/s   00:01    
dewiki_1930_male_descriptions.json            100% 2604KB 650.9KB/s   00:04    
dewiki_1940_female_descriptions.json          100% 1824KB 456.0KB/s   00:04    
dewiki_1940_male_descriptions.json            100% 2556KB 638.9KB/s   00:04    
dewiki_1950_female_descriptions.json          100% 2225KB 556.2KB/s   00:04    
dewiki_1950_male_descriptions.json            100% 2562KB 512.3KB/s   00:05    
dewiki_1950_nogender_descriptions.json        100%  263     0.3KB/s   00:00    
dewiki_1960_female_descriptions.json          100% 2563KB 640.8KB/s   00:04    
dewiki_1960_male_descriptions.json            100% 2581KB 645.1KB/s   00:04    
dewiki_1960_nogender_descriptions.json        100%  257     0.3KB/s   00:00    
dewiki_1970_female_descriptions.json          100% 2546KB 636.5KB/s   00:04    
dewiki_1970_male_descriptions.json            100% 2574KB 643.6KB/s   00:04    
dewiki_1980_female_descriptions.json          100% 2544KB 508.7KB/s   00:05    
dewiki_1980_male_descriptions.json            100% 2595KB 432.5KB/s   00:06    
enwiki_1930_female_descriptions.json          100% 1858KB 619.2KB/s   00:03    
enwiki_1930_male_descriptions.json            100% 2505KB 626.2KB/s   00:04    
enwiki_1930_nogender_descriptions.json        100%   12KB  12.2KB/s   00:00    
enwiki_1940_female_descriptions.json          100% 2528KB 842.7KB/s   00:03    
enwiki_1940_male_descriptions.json            100% 2509KB 836.2KB/s   00:03    
enwiki_1940_nogender_descriptions.json        100%   16KB  16.0KB/s   00:00    
enwiki_1950_female_descriptions.json          100% 2505KB 626.3KB/s   00:04    
enwiki_1950_male_descriptions.json            100% 2528KB 632.0KB/s   00:04    
enwiki_1950_nogender_descriptions.json        100%   22KB  21.9KB/s   00:00    
enwiki_1960_female_descriptions.json          100% 2524KB 631.1KB/s   00:04    
enwiki_1960_male_descriptions.json            100% 2518KB 629.6KB/s   00:04    
enwiki_1960_nogender_descriptions.json        100%   21KB  21.2KB/s   00:00    
enwiki_1970_female_descriptions.json          100% 2534KB 633.4KB/s   00:04    
enwiki_1970_male_descriptions.json            100% 2530KB 632.4KB/s   00:04    
enwiki_1970_nogender_descriptions.json        100%   22KB  22.2KB/s   00:00    
enwiki_1980_female_descriptions.json          100% 2494KB 623.6KB/s   00:04    
enwiki_1980_male_descriptions.json            100% 2528KB 632.0KB/s   00:04    
enwiki_1980_nogender_descriptions.json        100%   22KB  22.4KB/s   00:00    
jawiki_1930_female_descriptions.json          100%  228KB 228.1KB/s   00:01    
jawiki_1930_male_descriptions.json            100% 1426KB 712.8KB/s   00:02    
jawiki_1930_nogender_descriptions.json        100%  354KB 354.2KB/s   00:01    
jawiki_1940_female_descriptions.json          100%  363KB 363.0KB/s   00:01    
jawiki_1940_male_descriptions.json            100% 1902KB 475.6KB/s   00:04    
jawiki_1940_nogender_descriptions.json        100%  443KB 443.5KB/s   00:01    
jawiki_1950_female_descriptions.json          100%  452KB 451.7KB/s   00:01    
jawiki_1950_male_descriptions.json            100% 1904KB 634.6KB/s   00:03    
jawiki_1950_nogender_descriptions.json        100%  398KB 398.3KB/s   00:01    
jawiki_1960_female_descriptions.json          100%  681KB 340.5KB/s   00:02    
jawiki_1960_male_descriptions.json            100% 2177KB 544.2KB/s   00:04    
jawiki_1960_nogender_descriptions.json        100%  394KB 394.4KB/s   00:01    
jawiki_1970_female_descriptions.json          100%  984KB 492.0KB/s   00:02    
jawiki_1970_male_descriptions.json            100% 2705KB 541.0KB/s   00:05    
jawiki_1970_nogender_descriptions.json        100%  387KB 387.5KB/s   00:01    
jawiki_1980_female_descriptions.json          100% 1522KB 760.9KB/s   00:02    
jawiki_1980_male_descriptions.json            100% 2710KB 677.5KB/s   00:04    
jawiki_1980_nogender_descriptions.json        100%  296KB 296.5KB/s   00:00    
kowiki_1930_female_descriptions.json          100%   56KB  56.4KB/s   00:01    
kowiki_1930_male_descriptions.json            100%  368KB 367.7KB/s   00:00    
kowiki_1930_nogender_descriptions.json        100% 6241     6.1KB/s   00:01    
kowiki_1940_female_descriptions.json          100%  102KB 102.4KB/s   00:00    
kowiki_1940_male_descriptions.json            100%  567KB 567.0KB/s   00:01    
kowiki_1940_nogender_descriptions.json        100%   10KB  10.2KB/s   00:00    
kowiki_1950_female_descriptions.json          100%  144KB 144.3KB/s   00:00    
kowiki_1950_male_descriptions.json            100%  584KB 292.2KB/s   00:02    
kowiki_1950_nogender_descriptions.json        100%   16KB  15.8KB/s   00:00    
kowiki_1960_female_descriptions.json          100%  236KB 235.8KB/s   00:01    
kowiki_1960_male_descriptions.json            100%  751KB 375.6KB/s   00:02    
kowiki_1960_nogender_descriptions.json        100%   18KB  17.7KB/s   00:00    
kowiki_1970_female_descriptions.json          100%  375KB 374.6KB/s   00:01    
kowiki_1970_male_descriptions.json            100%  984KB 491.8KB/s   00:02    
kowiki_1970_nogender_descriptions.json        100%   27KB  26.6KB/s   00:00    
kowiki_1980_female_descriptions.json          100%  548KB 548.4KB/s   00:01    
kowiki_1980_male_descriptions.json            100% 1360KB 679.9KB/s   00:02    
kowiki_1980_nogender_descriptions.json        100%   22KB  21.8KB/s   00:00    
tlwiki_1930_female_descriptions.json          100%   44KB  43.8KB/s   00:00    
tlwiki_1930_male_descriptions.json            100%   90KB  89.9KB/s   00:00    
tlwiki_1940_female_descriptions.json          100%   58KB  58.4KB/s   00:00    
tlwiki_1940_male_descriptions.json            100%  120KB 119.9KB/s   00:01    
tlwiki_1940_nogender_descriptions.json        100%  143     0.1KB/s   00:00    
tlwiki_1950_female_descriptions.json          100%   73KB  73.3KB/s   00:00    
tlwiki_1950_male_descriptions.json            100%  119KB 118.6KB/s   00:01    
tlwiki_1950_nogender_descriptions.json        100%  396     0.4KB/s   00:00    
tlwiki_1960_female_descriptions.json          100%  106KB 106.1KB/s   00:00    
tlwiki_1960_male_descriptions.json            100%  138KB 137.6KB/s   00:01    
tlwiki_1960_nogender_descriptions.json        100%  286     0.3KB/s   00:00    
tlwiki_1970_female_descriptions.json          100%  155KB 155.0KB/s   00:01    
tlwiki_1970_male_descriptions.json            100%  149KB 148.9KB/s   00:00    
tlwiki_1970_nogender_descriptions.json        100%  854     0.8KB/s   00:00    
tlwiki_1980_female_descriptions.json          100%  207KB 207.2KB/s   00:00    
tlwiki_1980_male_descriptions.json            100%  168KB 168.3KB/s   00:01    
tlwiki_1980_nogender_descriptions.json        100%  440     0.4KB/s   00:00    
urwiki_1930_female_descriptions.json          100%   27KB  27.2KB/s   00:00    
urwiki_1930_male_descriptions.json            100%   45KB  45.5KB/s   00:00    
urwiki_1940_female_descriptions.json          100%   39KB  39.5KB/s   00:00    
urwiki_1940_male_descriptions.json            100%   52KB  51.6KB/s   00:00    
urwiki_1950_female_descriptions.json          100%   47KB  47.0KB/s   00:00    
urwiki_1950_male_descriptions.json            100%   47KB  47.0KB/s   00:00    
urwiki_1960_female_descriptions.json          100%   57KB  57.4KB/s   00:01    
urwiki_1960_male_descriptions.json            100%   37KB  37.2KB/s   00:00    
urwiki_1960_nogender_descriptions.json        100% 1113     1.1KB/s   00:00    
urwiki_1970_female_descriptions.json          100%  114KB 113.7KB/s   00:00    
urwiki_1970_male_descriptions.json            100%   33KB  33.2KB/s   00:01    
urwiki_1980_female_descriptions.json          100%  196KB 195.5KB/s   00:00    
urwiki_1980_male_descriptions.json            100%   33KB  33.3KB/s   00:00    
urwiki_1980_nogender_descriptions.json        100% 1036     1.0KB/s   00:00    
zhwiki_1930_female_descriptions.json          100%   95KB  95.4KB/s   00:00    
zhwiki_1930_male_descriptions.json            100%  683KB 341.6KB/s   00:02    
zhwiki_1930_nogender_descriptions.json        100%  251KB 251.2KB/s   00:01    
zhwiki_1940_female_descriptions.json          100%  148KB 148.0KB/s   00:01    
zhwiki_1940_male_descriptions.json            100%  908KB 302.6KB/s   00:03    
zhwiki_1940_nogender_descriptions.json        100%  363KB 362.6KB/s   00:01    
zhwiki_1950_female_descriptions.json          100%  209KB 209.2KB/s   00:00    
zhwiki_1950_male_descriptions.json            100%  985KB 492.6KB/s   00:02    
zhwiki_1950_nogender_descriptions.json        100%  580KB 289.8KB/s   00:02    
zhwiki_1960_female_descriptions.json          100%  307KB 307.3KB/s   00:01    
zhwiki_1960_male_descriptions.json            100% 1043KB   1.0MB/s   00:01    
zhwiki_1960_nogender_descriptions.json        100%  499KB 499.2KB/s   00:01    
zhwiki_1970_female_descriptions.json          100%  492KB 492.5KB/s   00:01    
zhwiki_1970_male_descriptions.json            100% 1283KB 641.4KB/s   00:02    
zhwiki_1970_nogender_descriptions.json        100%  243KB 242.7KB/s   00:01    
zhwiki_1980_female_descriptions.json          100%  990KB 495.0KB/s   00:02    
zhwiki_1980_male_descriptions.json            100% 1768KB 589.4KB/s   00:03    
zhwiki_1980_nogender_descriptions.json        100%  280KB 280.2KB/s   00:00    

In [7]:
description_files = !ls helpers/inspection/expanded_descriptions/
len(description_files)


Out[7]:
117

In [8]:
celebrity_dict = {'jawiki': [u'俳優', u'選手', u'歌手', u'ミュージシャン', u'モデル', u'アイドル'], 
                              'zhwiki': [u'演員', u'運動員', u'歌手', u'音乐家', u'模特兒', u'偶像'],
                              'tlwiki': [u'artista', 'aktor', u'player', u'mang-aawit', u'musikero', u'modelo', u'idolo'],
                              'urwiki': [u'اردو', u'کھلاڑ', u'گلوکار' , u'موسیقار' , u'ماڈل', u'بت'],
                              'dewiki': [u'schauspieler' , u'spieler', u'Musiker', u'Sänger', u'Modell', u'Idol'],
                              'enwiki' :[u'actor', u'actress', u'player', u'singer', u'musician', u'model', u'idol'],
                              'kowiki' : [u'배우', u'선수', u'가수', u'음악가', u'모델', u'우상']}

def intext(text, xxwiki):
    if text:
        text = text.encode('utf-8').lower()
        engwords = celebrity_dict['enwiki']
        foreignwords = celebrity_dict[xxwiki]
        for word in engwords + foreignwords:
            if word.encode('utf-8').lower() in text:
                return True
        #if we get to this point its too late
        return False
    else: return False


celebdf = pd.DataFrame(columns=['wiki','decade','gender','celeb_per'])
for f in description_files:
    parts = f.split('_')
    xxwiki, decade, gender = parts[0], parts[1], parts[2]
    df = pd.DataFrame.from_dict(json.load(open('helpers/inspection/expanded_descriptions/{}'.format(f), 'r')), orient='index')
    df['celeb'] = df['text'].apply(lambda text: intext(text,xxwiki))
    test_per = df['celeb'].sum()/float(len(df))
    celebdf = celebdf.append({'wiki':xxwiki, 'decade':int(decade), 'gender':gender, 'celeb_per':test_per}, ignore_index=True)

In [4]:
dummy_langs = pd.get_dummies(celebdf['wiki'])
dummy_gender = pd.get_dummies(celebdf['gender'])
dummy_gender = dummy_gender[['male','female','nogender']]
print dummy_langs.head(2)
print dummy_gender.head(2)


   dewiki  enwiki  jawiki  kowiki  tlwiki  urwiki  zhwiki
0       1       0       0       0       0       0       0
1       1       0       0       0       0       0       0
   male  female  nogender
0     0       1         0
1     1       0         0

In [5]:
catdf = celebdf[['celeb_per','decade']].join(dummy_langs.ix[:,'enwiki':]).join(dummy_gender.ix[:,'female':'female'])
catdf['intercept'] = 1.0

In [6]:
catdf.corr()


Out[6]:
celeb_per decade enwiki jawiki kowiki tlwiki urwiki zhwiki female intercept
celeb_per 1.000000 0.284661 -0.290744 -0.079482 0.125096 0.606231 0.017558 -0.154101 0.460273 NaN
decade 0.284661 1.000000 -0.011882 -0.011882 -0.011882 0.024452 0.036556 -0.011882 -0.020852 NaN
enwiki -0.290744 -0.011882 1.000000 -0.181818 -0.181818 -0.175810 -0.157204 -0.181818 -0.022792 NaN
jawiki -0.079482 -0.011882 -0.181818 1.000000 -0.181818 -0.175810 -0.157204 -0.181818 -0.022792 NaN
kowiki 0.125096 -0.011882 -0.181818 -0.181818 1.000000 -0.175810 -0.157204 -0.181818 -0.022792 NaN
tlwiki 0.606231 0.024452 -0.175810 -0.175810 -0.175810 1.000000 -0.152009 -0.175810 -0.005186 NaN
urwiki 0.017558 0.036556 -0.157204 -0.157204 -0.157204 -0.152009 1.000000 -0.157204 0.053489 NaN
zhwiki -0.154101 -0.011882 -0.181818 -0.181818 -0.181818 -0.175810 -0.157204 1.000000 -0.022792 NaN
female 0.460273 -0.020852 -0.022792 -0.022792 -0.022792 -0.005186 0.053489 -0.022792 1.000000 NaN
intercept NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

In [7]:
train_cols = catdf.columns[1:]
logit = sm.Logit(catdf['celeb_per'], catdf[train_cols])
result= logit.fit()


Optimization terminated successfully.
         Current function value: 0.466279
         Iterations 6

In [19]:
result.summary()


Out[19]:
Logit Regression Results
Dep. Variable: celeb_per No. Observations: 117
Model: Logit Df Residuals: 108
Method: MLE Df Model: 8
Date: Wed, 07 Jan 2015 Pseudo R-squ.: 0.3079
Time: 11:25:05 Log-Likelihood: -54.555
converged: True LL-Null: -78.830
LLR p-value: 7.752e-08
coef std err z P>|z| [95.0% Conf. Int.]
decade 0.0236 0.013 1.823 0.068 -0.002 0.049
enwiki 0.0509 0.875 0.058 0.954 -1.664 1.766
jawiki 0.7763 0.837 0.927 0.354 -0.865 2.418
kowiki 1.3834 0.832 1.662 0.097 -0.248 3.015
tlwiki 3.0009 0.945 3.176 0.001 1.149 4.853
urwiki 0.8901 0.869 1.025 0.306 -0.813 2.593
zhwiki 0.5383 0.846 0.637 0.524 -1.119 2.196
female 1.3580 0.453 2.999 0.003 0.471 2.245
intercept -47.9056 25.368 -1.888 0.059 -97.626 1.815

In [17]:
subj_list = ['female','male','nogender']

fig, axes = plt.subplots(nrows = 1, ncols = len(subj_list), sharex='col', sharey='row')
for ax, subj in zip(axes, subj_list):

    natlangdf = celebdf[celebdf['gender'] == subj]
    natlangpiv = pd.pivot_table(natlangdf, values='celeb_per', rows='decade', cols='wiki')
    natlangpiv = natlangpiv[['jawiki','zhwiki','kowiki','tlwiki','urwiki','dewiki','enwiki']]
    natlangpiv.columns = ['Japanese', 'Chinese', 'Korean', 'Tagalog', 'Urdu', 'German', 'English']
    natlangpiv = natlangpiv * 100
    heatmap = ax.pcolor(natlangpiv, cmap='Purples', vmin=0, vmax=100)

    ax.set_yticks(np.arange(0.5, len(natlangpiv.index), 1))
    ax.set_yticklabels(map(int, natlangpiv.index))
    ax.set_xticks(np.arange(0.5, len(natlangpiv.columns), 1))
    ax.set_xticklabels(natlangpiv.columns, rotation=90)

fig.suptitle('''Heatmap of Celebrity Biography %, By Decade of Birth versus Wikipedia Language by Gender''', fontsize=18)
fig.set_size_inches(12,4,dpi=600)
#fig.tight_layout()

subj_titles = ['Female','Male','Not Recorded or Non-Binary']
metric_titles =['Decade'] 

cbar = plt.colorbar(mappable=heatmap, ax=ax, format="%.0f%%")

for i in range(len(subj_titles)):
    axes[i].set_title(subj_titles[i])

fig.subplots_adjust(wspace=0.0, hspace=0.0, top=0.85)



In [17]:
subplots_adjust


Out[17]:
<function matplotlib.pyplot.subplots_adjust>

In [11]:
actress_dict = {'jawiki': u'俳優', 'zhwiki': u'演員', 'tlwiki':u'artista', 'urwiki': u'اردو', 'dewiki': 'schauspieler' , 'enwiki' :'actress'}
player_dict =   {'jawiki': u'選手', 'zhwiki': u'運動員', 'tlwiki':u'player', 'urwiki': u'کھلاڑ',  'dewiki': 'spieler' , 'enwiki' :'player'}



def multiword(xxwiki, prof_dict):
    def intext(text):
        if text:
            text = text.lower()
            eng = prof_dict['enwiki']
            foreign = prof_dict[xxwiki]
            if eng in text or foreign in text:
                return True
            else: return False
        else: return False
    return intext


celeb = defaultdict(dict)
for xxwiki in ('jawiki', 'tlwiki', 'urwiki', 'zhwiki', 'enwiki', 'dewiki'):
    df = pd.DataFrame.from_dict(json.load(open('helpers/inspection/descriptions/{}_descriptions.json'.format(xxwiki), 'r')), orient='index')
    for prof, test in (('actress',  multiword(xxwiki, actress_dict)), ('player',multiword(xxwiki, player_dict))):
        df[prof] = df['text'].apply(test)
        test_per = df[prof].sum()/float(len(df))
        celeb[xxwiki][prof] = test_per

In [8]:
for xxwiki in ('jawiki', 'tlwiki', 'urwiki', 'zhwiki', 'enwiki', 'dewiki'):
    df = pd.DataFrame.from_dict(json.load(open('helpers/inspection/descriptions/{}_descriptions.json'.format(xxwiki), 'r')), orient='index')
    df['text'] = df['text'].apply(lambda x: x.replace('\n',' ') if x else x)
    df.to_csv('helpers/inspection/readable/{}_modern_bios_for_inspection.csv'.format(xxwiki), encoding='utf-8')

In [18]:
celebdf = pd.DataFrame.from_dict(celeb, orient='index')
celebdf['either'] = celebdf['player'] + celebdf['actress']
celebdf.sort('either')


Out[18]:
player actress either
enwiki 0.223995 0.042483 0.266478
jawiki 0.198402 0.090304 0.288705
dewiki 0.287200 0.075000 0.362200
zhwiki 0.251717 0.121909 0.373626
tlwiki 0.068075 0.419601 0.487676
urwiki 0.016148 0.741638 0.757785

In [19]:
celebdf['wiki'].convert_object(convert_dates=True)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-19-abc28e5c26e7> in <module>()
----> 1 celebdf['wiki'].convert_object(convert_dates=True)

/usr/local/lib/python2.7/dist-packages/pandas/core/generic.pyc in __getattr__(self, name)
   1945                 return self[name]
   1946             raise AttributeError("'%s' object has no attribute '%s'" %
-> 1947                                  (type(self).__name__, name))
   1948 
   1949     def __setattr__(self, name, value):

AttributeError: 'Series' object has no attribute 'convert_object'

In [14]:
celebdf


Out[14]:
wiki decade gender celeb_per
0 dewiki 1930 female 0.313582
1 dewiki 1930 male 0.147053
2 dewiki 1940 female 0.271985
3 dewiki 1940 male 0.162964
4 dewiki 1950 female 0.246338
5 dewiki 1950 male 0.186657
6 dewiki 1950 nogender 0.000000
7 dewiki 1960 female 0.323635
8 dewiki 1960 male 0.256650
9 dewiki 1960 nogender 0.000000
10 dewiki 1970 female 0.461909
11 dewiki 1970 male 0.328388
12 dewiki 1980 female 0.511078
13 dewiki 1980 male 0.331519
14 enwiki 1930 female 0.293813
15 enwiki 1930 male 0.167428
16 enwiki 1930 nogender 0.104167
17 enwiki 1940 female 0.262633
18 enwiki 1940 male 0.179598
19 enwiki 1940 nogender 0.079365
20 enwiki 1950 female 0.253739
21 enwiki 1950 male 0.202080
22 enwiki 1950 nogender 0.102273
23 enwiki 1960 female 0.332370
24 enwiki 1960 male 0.250150
25 enwiki 1960 nogender 0.159091
26 enwiki 1970 female 0.439121
27 enwiki 1970 male 0.296730
28 enwiki 1970 nogender 0.152174
29 enwiki 1980 female 0.494689
... ... ... ... ...
87 urwiki 1940 female 0.839744
88 urwiki 1940 male 0.075377
89 urwiki 1950 female 0.841270
90 urwiki 1950 male 0.107955
91 urwiki 1960 female 0.868996
92 urwiki 1960 male 0.161972
93 urwiki 1960 nogender 0.000000
94 urwiki 1970 female 0.902844
95 urwiki 1970 male 0.198413
96 urwiki 1980 female 0.928264
97 urwiki 1980 male 0.203008
98 urwiki 1980 nogender 0.000000
99 zhwiki 1930 female 0.371336
100 zhwiki 1930 male 0.191993
101 zhwiki 1930 nogender 0.070039
102 zhwiki 1940 female 0.275641
103 zhwiki 1940 male 0.225217
104 zhwiki 1940 nogender 0.043702
105 zhwiki 1950 female 0.358438
106 zhwiki 1950 male 0.269242
107 zhwiki 1950 nogender 0.038462
108 zhwiki 1960 female 0.560241
109 zhwiki 1960 male 0.423300
110 zhwiki 1960 nogender 0.069982
111 zhwiki 1970 female 0.684178
112 zhwiki 1970 male 0.489796
113 zhwiki 1970 nogender 0.272374
114 zhwiki 1980 female 0.720637
115 zhwiki 1980 male 0.413772
116 zhwiki 1980 nogender 0.387960

117 rows × 4 columns


In [15]:
natlangpiv = pd.pivot_table(celebdf, values='celeb_per', rows='decade', cols='wiki')

In [16]:
natlangpiv


Out[16]:
wiki dewiki enwiki jawiki kowiki tlwiki urwiki zhwiki
decade
1930 0.230318 0.188469 0.283101 0.311064 0.717719 0.496977 0.211123
1940 0.217475 0.173866 0.291716 0.448845 0.812707 0.457560 0.181520
1950 0.144332 0.186030 0.314639 0.446219 0.666099 0.474612 0.222047
1960 0.193429 0.247204 0.383604 0.584960 0.876718 0.343656 0.351174
1970 0.395148 0.296008 0.460160 0.639824 0.914757 0.550628 0.482116
1980 0.421298 0.340357 0.508095 0.594178 0.924142 0.377091 0.507456

In [ ]: